📚 COSMI CODE INTERNSHIP
WEEK - 1
ABDULLAH IMRAN 📚

Table of Contents¶

  1. Question No. 1
  2. Question No. 2
  3. Question No. 3
  4. Question No. 4
  5. Question No. 5

Question No. 1

Implement a basic linear regression model from scratch using NumPy. Fit the model to a simple dataset and plot the regression line.¶

1.1 | Functions Created For Linear Regression

In [42]:
import numpy as np
import matplotlib.pyplot as plt

class LinearRegression:
    """Univariate linear regression fitted with batch gradient descent.

    Learns y = m * x + b by minimizing the half mean squared error
    J(m, b) = (1 / 2n) * sum((y - y_pred)^2).
    """

    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.learning_rate = learning_rate  # gradient-descent step size
        self.n_iterations = n_iterations    # number of update iterations

    def fit(self, X, y):
        """Fit the model and return (slope, intercept, cost_history).

        X and y are 1-D array-likes of equal length; the returned cost
        history has one entry per iteration for convergence plotting.
        (The original kept an unused `n = len(X)` here; removed.)
        """
        X = np.array(X).reshape(-1, 1)
        y = np.array(y).reshape(-1, 1)
        m, b = self._initialize_parameters()

        costs = []
        for _ in range(self.n_iterations):
            # Predict with current parameters, then step down the gradient.
            y_pred = self._predict(X, m, b)
            dm, db = self._compute_gradients(X, y, y_pred)
            m, b = self._update_parameters(m, b, dm, db)
            # Cost is recorded for the pre-update prediction of this iteration.
            costs.append(self._compute_cost(y, y_pred))

        return m, b, costs

    def _initialize_parameters(self):
        """Start both slope and intercept at zero."""
        return 0, 0

    def _predict(self, X, m, b):
        """Return the element-wise model prediction m * X + b."""
        return X * m + b

    def _compute_gradients(self, X, y, y_pred):
        """Return (dJ/dm, dJ/db) for the mean-squared-error cost."""
        n = len(X)
        dm = (-2 / n) * np.sum(X * (y - y_pred))
        db = (-2 / n) * np.sum(y - y_pred)
        return dm, db

    def _update_parameters(self, m, b, dm, db):
        """Take one gradient-descent step and return the new (m, b)."""
        m -= self.learning_rate * dm
        b -= self.learning_rate * db
        return m, b

    def _compute_cost(self, y, y_pred):
        """Return the half mean squared error between y and y_pred."""
        n = len(y)
        return (1 / (2 * n)) * np.sum((y - y_pred) ** 2)

    def plot(self, X, y, m, b, costs):
        """Plot the fitted regression line beside the cost-vs-iteration curve."""
        X = np.array(X).reshape(-1, 1)
        y_pred = self._predict(X, m, b)

        plt.figure(figsize=(14, 7))

        # Scatter plot of data points with regression line
        plt.subplot(1, 2, 1)
        plt.scatter(X, y, color='blue', label='Data points', edgecolor='k', s=70, linewidth=1, alpha= 0.9)
        plt.plot(X, y_pred, color='skyblue', label='Regression line', linewidth=2)
        plt.xlabel("X (Feature)", fontsize=12)
        plt.ylabel("y (Outcome)", fontsize=12)
        plt.title("Linear Regression with Gradient Descent", fontsize=14)
        plt.legend()
        plt.grid(True, linestyle='--', alpha=0.6)

        # Plot the cost history with annotations
        plt.subplot(1, 2, 2)
        plt.plot(range(1, len(costs) + 1), costs, color='green', linewidth=2, marker='o', markersize=5, alpha= 0.9)
        plt.xlabel("Iteration", fontsize=12)
        plt.ylabel("Cost", fontsize=12)
        plt.title("Cost vs Iteration", fontsize=14)

        # Annotate minimum cost
        min_cost_index = np.argmin(costs)
        plt.annotate(f"Min Cost: {costs[min_cost_index]:.5f}",
                     xy=(min_cost_index + 1, costs[min_cost_index]),
                     xytext=(min_cost_index + 50, costs[min_cost_index] + 0.5),
                     arrowprops=dict(facecolor='black', shrink=0.05),
                     fontsize=12, color='black')

        plt.grid(True, linestyle='--', alpha=0.6)

        plt.tight_layout()
        plt.show()

1.2 | Example Usage & Results

In [43]:
# Provided dataset
X = np.array([1, 2, 3, 4, 5])
y = np.array([1, 2, 1.3, 3.75, 2.25])

# Create an instance of the LinearRegression class
lr = LinearRegression()

# Fit the model to the dataset
m, b, costs = lr.fit(X, y)

# Plot the regression line and cost graph
lr.plot(X, y, m, b, costs)

Question No. 2

Perform data cleaning on a real-world dataset. Handle missing values, detect and remove outliers, and normalize/standardize the data using pandas¶

2.1 | Import Libraries & Load Dataset

In [1]:
import pandas as pd
from ydata_profiling import ProfileReport
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import MinMaxScaler, StandardScaler

df = pd.read_csv(r"C:\Users\acer\Downloads\cardata.csv")

2.2 | Analysis Report of Dataset

In [2]:
# Build the profiling report, then render it inline.
# BUG FIX: to_notebook_iframe() renders as a side effect and returns None,
# so the original `report = ProfileReport(df).to_notebook_iframe()` left
# `report` bound to None and the trailing `report` expression showed nothing.
report = ProfileReport(df)
report.to_notebook_iframe()
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

2.3 | Null Values Handling Portion

2.3.1 | Check for missing values¶

In [33]:
df.isnull().any()
Out[33]:
Car_Name          True
Year              True
Selling_Price     True
Present_Price     True
Kms_Driven        True
Fuel_Type        False
Seller_Type      False
Transmission      True
Owner            False
dtype: bool

2.3.2 | Checking Number of Missing Values In Each Column¶

In [34]:
df.isnull().sum()
Out[34]:
Car_Name         1
Year             1
Selling_Price    4
Present_Price    1
Kms_Driven       2
Fuel_Type        0
Seller_Type      0
Transmission     2
Owner            0
dtype: int64

2.3.3 | Filling Missing Values¶

In [39]:
# Split columns by dtype so numeric and categorical gaps are imputed differently.
numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
categorical_cols = df.select_dtypes(include='object').columns.tolist()

# Impute missing numeric values with the column median (robust to outliers).
for col in numerical_cols:
    if df[col].isnull().any():
        median_value = df[col].median()
        # Assign back instead of `df[col].fillna(..., inplace=True)`:
        # inplace fillna on a column selection is chained assignment, which
        # is deprecated in pandas 2.x and a no-op under Copy-on-Write.
        df[col] = df[col].fillna(median_value)
        print(f'\nFilled missing values in {col} with median: {median_value}')

# Impute missing categorical values with the most frequent value (mode).
for col in categorical_cols:
    if df[col].isnull().any():
        mode_value = df[col].mode()[0]
        df[col] = df[col].fillna(mode_value)
        print(f'\nFilled missing values in {col} with mode: {mode_value}')
Filled missing values in Year with median: 2014.0

Filled missing values in Selling_Price with median: 3.6

Filled missing values in Present_Price with median: 6.445

Filled missing values in Kms_Driven with median: 32000.0

Filled missing values in Car_Name with mode: city

Filled missing values in Transmission with mode: Manual

2.3.4 | Checking Missing Values After Imputation¶

In [37]:
df.isnull().any()
Out[37]:
Car_Name         False
Year             False
Selling_Price    False
Present_Price    False
Kms_Driven       False
Fuel_Type        False
Seller_Type      False
Transmission     False
Owner            False
dtype: bool

2.4 | Duplicate Values Handling Portion

2.4.1 | Checking Duplicates¶

In [38]:
df.duplicated().any()
Out[38]:
True

2.4.2 | Finding Duplicate Rows¶

In [72]:
df[df.duplicated()]
Out[72]:
Car_Name Year Selling_Price Present_Price Kms_Driven Fuel_Type Seller_Type Transmission Owner
93 80 2015.0 13.65 22.95 40000.0 1 0 0 0
In [73]:
df[df.duplicated(keep=False)]
Out[73]:
Car_Name Year Selling_Price Present_Price Kms_Driven Fuel_Type Seller_Type Transmission Owner
51 80 2015.0 13.65 22.95 40000.0 1 0 0 0
93 80 2015.0 13.65 22.95 40000.0 1 0 0 0

2.4.3 | Removing Duplicates¶

In [40]:
# Work on a copy so the imputed frame `df` stays intact.
df1 = df.copy()
df1.drop_duplicates(inplace=True)
# \033[91m switches the text to red; fixed typo "Befor" -> "Before".
print(f"\033[91m Before drop duplicate data: rows= {df.shape[0]} | columns= {df.shape[1]}")
 Befor drop duplicate data: rows= 301 | columns= 9

2.4.4 | Checking Dataset After Dropping Duplicate Data¶

In [18]:
print(f"\033[92m After drop duplicate data: rows= {df1.shape[0]} | columns= {df1.shape[1]}")
 After drop duplicate data: rows= 300 | columns= 9

2.5 | Outliers Handling Portion

2.5.1 | Functions For Detecting Outliers & Plotting¶

In [45]:
def calculate_iqr(df):
    """Return (Q1, Q3, IQR) Series over the numeric columns of *df*."""
    q1 = df.quantile(0.25, numeric_only=True)
    q3 = df.quantile(0.75, numeric_only=True)
    return q1, q3, q3 - q1

def outlier_detect(df, col, Q1, Q3, IQR):
    """Return the rows of *df* whose *col* value lies outside the IQR fences."""
    lower_fence = Q1[col] - 1.5 * IQR[col]
    upper_fence = Q3[col] + 1.5 * IQR[col]
    outside = (df[col] < lower_fence) | (df[col] > upper_fence)
    return df[outside]

def outlier_detect_normal(df, col):
    """Return the rows of *df* whose *col* z-score exceeds 3 in magnitude."""
    z_scores = (df[col] - df[col].mean()) / df[col].std()
    return df[z_scores.abs() > 3]

def replace_outliers(df, col, Q1, Q3, IQR):
    """Clip outliers in *col* to the IQR fences, in place.

    BUG FIX: the original computed `upper_bound = Q3[col] + 1.5 * I`
    followed by a stray `QR[col]` line — a split statement that raised
    NameError on the undefined name `I` whenever the function was called.
    """
    lower_bound = Q1[col] - 1.5 * IQR[col]
    upper_bound = Q3[col] + 1.5 * IQR[col]
    # Winsorize: values beyond a fence are replaced by the fence itself.
    df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)
    print(f'Outliers replaced with bounds for column: {col}')

def process_dataset(df):
    """Detect outliers in every numeric column (IQR and z-score methods),
    report the counts, and clip IQR outliers to their fences.

    Returns the dataframe (mutated in place by replace_outliers).
    """
    continuous_cols = df.select_dtypes(include=np.number).columns.tolist()
    Q1, Q3, IQR = calculate_iqr(df)

    outlier_cols = []
    for col in continuous_cols:
        # Compute the IQR outliers once per column; the original called
        # outlier_detect twice, repeating the filtering work.
        iqr_outlier_count = outlier_detect(df, col, Q1, Q3, IQR).shape[0]
        if iqr_outlier_count != 0:
            outlier_cols.append(col)

        print(f"IQR => {col}: {iqr_outlier_count}")
        print(f"Z_Score => {col}: {outlier_detect_normal(df, col).shape[0]}")
        print("********************************")

    # Clip only the columns that actually contained IQR outliers.
    for col in outlier_cols:
        replace_outliers(df, col, Q1, Q3, IQR)

    print("\n********************************\n")

    for col in outlier_cols:
        print(f"Handling outliers for column: {col}")

    return df

def plot_outliers(df, cols):
    """
    Plot box plots and scatter plots for numerical columns with Z-Score and IQR outliers.

    Produces two figures: a grid of box plots (one per column in *cols*),
    then a grid of scatter plots where z-score outliers are marked red and
    IQR outliers orange (a point flagged by both methods shows orange,
    since the IQR layer is drawn last).
    """
    num_cols = len(cols)
    ncols = 5  # Number of columns for the subplot grid
    nrows = (num_cols + ncols - 1) // ncols  # Compute number of rows needed

    plt.figure(figsize=(15, 5 * nrows))
    
    # Figure 1: one horizontal box plot per column.
    for i, col in enumerate(cols):
        plt.subplot(nrows, ncols, i + 1)
        plt.boxplot(df[col].dropna(), vert=False)
        plt.title(f'Box Plot of {col}')
        plt.xlabel(col)

    plt.tight_layout()
    plt.show()

    plt.figure(figsize=(15, 5 * nrows))
    
    # Figure 2: scatter of each column vs row index with outliers highlighted.
    for i, col in enumerate(cols):
        plt.subplot(nrows, ncols, i + 1)
        plt.scatter(df.index, df[col], label='Data points', alpha=0.6)
        z_outliers = outlier_detect_normal(df, col)
        # NOTE(review): calculate_iqr(df) is recomputed on every iteration;
        # it could be hoisted out of the loop since df does not change here.
        iqr_outliers = outlier_detect(df, col, *calculate_iqr(df))
        
        plt.scatter(z_outliers.index, z_outliers[col], color='red', label='Z-Score Outliers', alpha=0.8)
        plt.scatter(iqr_outliers.index, iqr_outliers[col], color='orange', label='IQR Outliers', alpha=0.6)
        
        plt.title(col)
        plt.xlabel('Index')
        plt.ylabel(col)
        plt.legend()
        plt.grid(True)

    plt.tight_layout()
    plt.show()

2.5.2 | Checking Number of Outliers In Each Feature¶

In [41]:
# Process and visualize outliers in the dataset
df2 = process_dataset(df1)
IQR => Year: 7
Z_Score => Year: 3
********************************
IQR => Selling_Price: 16
Z_Score => Selling_Price: 8
********************************
IQR => Present_Price: 13
Z_Score => Present_Price: 5
********************************
IQR => Kms_Driven: 8
Z_Score => Kms_Driven: 3
********************************
IQR => Owner: 11
Z_Score => Owner: 11
********************************
Outliers replaced with bounds for column: Year
Outliers replaced with bounds for column: Selling_Price
Outliers replaced with bounds for column: Present_Price
Outliers replaced with bounds for column: Kms_Driven
Outliers replaced with bounds for column: Owner

********************************

Handling outliers for column: Year
Handling outliers for column: Selling_Price
Handling outliers for column: Present_Price
Handling outliers for column: Kms_Driven
Handling outliers for column: Owner

2.5.3 | Plotting Outliers¶

In [46]:
# Plot the outliers
continuous_cols = df2.select_dtypes(include=np.number).columns.tolist()
plot_outliers(df2, continuous_cols)

2.6 | Numerical Columns Handling Portion

2.6.1 | Separate continuous and categorical columns¶

In [47]:
continuous_values = []
categorical_values = []

# Sort every column of df2 into a numeric (int64/float64) or categorical bucket.
for column, dtype in df2.dtypes.items():
    if dtype == 'int64' or dtype == 'float64':
        continuous_values.append(column)
    else:
        categorical_values.append(column)
In [48]:
print("Continuous  Columns:" ,continuous_values)
print("Categorical Columns :", categorical_values)
Continuous  Columns: ['Year', 'Selling_Price', 'Present_Price', 'Kms_Driven', 'Owner']
Categorical Columns : ['Car_Name', 'Fuel_Type', 'Seller_Type', 'Transmission']

2.6.2 | Normalization¶

In [49]:
# Min-Max scaling (Normalization): rescale each continuous column to [0, 1].
mms = MinMaxScaler()
df3 = df2.copy(deep=True)

for col in continuous_values:
    # Double brackets keep the selection 2-D, as fit_transform expects.
    df3[col] = mms.fit_transform(df3[[col]])

df3.head()
Out[49]:
Car_Name Year Selling_Price Present_Price Kms_Driven Fuel_Type Seller_Type Transmission Owner
0 ritz 0.666667 0.241523 0.233845 0.271852 Petrol Dealer Manual 0.0
1 sx4 0.583333 0.345564 0.409119 0.435990 Diesel Dealer Manual 0.0
2 ciaz 0.916667 0.531352 0.422874 0.065655 Petrol Dealer Manual 0.0
3 wagon r 0.416667 0.204366 0.169948 0.048215 Petrol Dealer Manual 0.0
4 swift 0.666667 0.334417 0.290643 0.323145 Diesel Dealer Manual 0.0

2.6.3 | Standardization¶

In [50]:
mms = StandardScaler()  
df4 = df2.copy(deep=True)

# Apply scaling using a loop
for col in continuous_values:
    df4[col] = mms.fit_transform(df4[[col]]) 

df4.head()
Out[50]:
Car_Name Year Selling_Price Present_Price Kms_Driven Fuel_Type Seller_Type Transmission Owner
0 ritz 0.124223 -0.239678 -0.237779 -0.308272 Petrol Dealer Manual 0.0
1 sx4 -0.237592 0.140648 0.412316 0.369728 Diesel Dealer Manual 0.0
2 ciaz 1.209671 0.819803 0.463336 -1.160010 Petrol Dealer Manual 0.0
3 wagon r -0.961224 -0.375509 -0.474776 -1.232047 Petrol Dealer Manual 0.0
4 swift 0.124223 0.099899 -0.027115 -0.096397 Diesel Dealer Manual 0.0

Question No. 3

Implement feature selection using correlation matrix and mutual information. Visualize the important features.¶

3.1 | Import libraries

In [22]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import mutual_info_regression
%matplotlib inline

df = pd.read_csv(r"C:\Users\acer\Downloads\cardata.csv")

3.2 | Feature Selection Using Correlation Matrix

3.2.1 | Function To Plot Correlation Matrix & Find Highly Correlated Features¶

In [23]:
def plot_correlation_matrix(df, threshold):
    """Plot the lower-triangle correlation heatmap of *df*'s numeric columns
    and return the features involved in any correlation above *threshold*.

    Parameters
    ----------
    df : pandas.DataFrame
    threshold : float
        Absolute-correlation cutoff for flagging a feature pair.

    Returns
    -------
    (correlated_features, num_features)
        De-duplicated list of flagged column names and its length.
    """
    # Select only numeric columns to avoid warning
    numeric_df = df.select_dtypes(include=[np.number])
    corr_matrix = numeric_df.corr()
    
    # Plot the correlation matrix; mask the upper triangle so each pair
    # appears only once.
    plt.figure(figsize=(12, 8))
    sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', vmin=-1, vmax=1, mask=np.triu(np.ones_like(corr_matrix, dtype=bool)))
    plt.title('Correlation Matrix')
    plt.show()

    # Find features with high correlation
    correlated_features = []
    for column in corr_matrix.columns:
        high_corr = corr_matrix.index[abs(corr_matrix[column]) > threshold].tolist()
        # Drop the trivial self-correlation (always 1.0). The original used a
        # conditional *expression* purely for its side effect; a plain `if`
        # statement is the idiomatic form.
        if column in high_corr:
            high_corr.remove(column)
        correlated_features.extend(high_corr)
    correlated_features = list(set(correlated_features))
    
    num_features = len(correlated_features)
    
    return correlated_features, num_features

3.2.2 | Example Usage With Correlated Features (Threshold > 0.6)¶

In [71]:
correlated_features, num_features = plot_correlation_matrix(df, threshold=0.6)
print(f"\nHighly correlated features (threshold > 0.6): {correlated_features} ({num_features} features)")

# Visualize the correlated features using pair plots
if num_features > 1:
    sns.pairplot(df[correlated_features])
    plt.suptitle('\n\nPair Plot of Highly Correlated Features', size=20)
    plt.show()

    # Visualize the correlated features using scatter plots
    for i, feature in enumerate(correlated_features):
        for j, other_feature in enumerate(correlated_features):
            if i < j:
                plt.figure(figsize=(8, 6))
                sns.scatterplot(x=df[feature], y=df[other_feature])
                plt.title(f'Scatter Plot of {feature} vs {other_feature}')
                plt.xlabel(feature)
                plt.ylabel(other_feature)
                plt.show()
else:
    print("No highly correlated features found above the threshold.")
Highly correlated features (threshold > 0.6): ['Present_Price', 'Kms_Driven', 'Seller_Type'] (3 features)

3.3 | Feature Selection Using Mutual Information

3.3.1 | Finding Best Features By Mutual Information Score & Printing Table¶

In [88]:
X = df1.drop(columns=['Owner', 'Selling_Price'])  # Exclude 'Owner' and 'Selling_Price'
y = df1['Selling_Price']

# Calculate mutual information for regression
mutual_info = mutual_info_regression(X, y)

# Create a DataFrame for feature importance
mutual_info_df = pd.DataFrame({'Feature': X.columns, 'Mutual Information': mutual_info})
mutual_info_df.sort_values(by='Mutual Information', ascending=False, inplace=True)

# Display the mutual information DataFrame in a table
print(mutual_info_df)

# Visualize the table using matplotlib
fig, ax = plt.subplots(figsize=(12, 8))  # Set the figure size
ax.axis('off')  # Hide the axes

# Create a table plot
table = ax.table(cellText=mutual_info_df.values, colLabels=mutual_info_df.columns, cellLoc='center', loc='center')
table.auto_set_font_size(False)
table.set_fontsize(12)
table.scale(1.2, 1.2)

plt.title('Mutual Information Table', fontsize=16)
plt.show()
         Feature  Mutual Information
2  Present_Price            1.253503
0       Car_Name            0.972751
5    Seller_Type            0.575705
1           Year            0.265208
4      Fuel_Type            0.166297
3     Kms_Driven            0.129226
6   Transmission            0.122107

3.3.2 | Plotting Feature Importance Based on Mutual Information¶

In [85]:
# Bar plot for feature importance
plt.figure(figsize=(9, 7))
sns.barplot(x='Mutual Information', y='Feature', data=mutual_info_df, palette='viridis')
plt.title('Feature Importance based on Mutual Information')
plt.xlabel('Mutual Information')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

3.3.3 | Plotting Feature Importance vs. Mutual Information¶

In [87]:
# Histogram for mutual information distribution
plt.figure(figsize=(7, 5))
sns.histplot(mutual_info_df['Mutual Information'], kde=True, color='skyblue')
plt.title('Mutual Information Distribution')
plt.xlabel('Mutual Information')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

# Scatter plot for feature importance vs. mutual information
plt.figure(figsize=(7, 5))
sns.scatterplot(x='Mutual Information', y='Feature', data=mutual_info_df, hue='Mutual Information', palette='coolwarm', s=100)
plt.title('Feature Importance vs. Mutual Information')
plt.xlabel('Mutual Information')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

Question No. 4

Conduct exploratory data analysis (EDA) on a dataset. Generate insightful visualizations using matplotlib and seaborn.¶

In this Question, I did Exploratory Data Analysis Using 12 Different Kinds of Plots.¶

4.1 | Import Libraries & Dataset

In [51]:
import pandas as pd
from ydata_profiling import ProfileReport
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import MinMaxScaler, StandardScaler

df = pd.read_csv(r"C:\Users\acer\Downloads\cardata.csv")

4.2 | Numerical Features Distribution

In [53]:
# Histogram with KDE overlay for every numeric column.
# NOTE(review): `sns` is not imported in this section's import cell (4.1);
# this relies on `import seaborn as sns` from Question 3 still being in the
# kernel namespace — add the import here to make the cell self-contained.
for column in df.select_dtypes(include=['float64', 'int64']).columns:
    plt.figure(figsize=(7, 5))
    sns.histplot(df[column], kde=True)
    plt.title(f'Histogram of {column}')
    plt.xlabel(column)
    plt.ylabel('Frequency')
    plt.show()

4.3 | Categorical Features Distribution

In [12]:
# Define categorical columns
categorical_columns = ['Fuel_Type', 'Seller_Type', 'Transmission']

# Plot pie charts for categorical columns
# NOTE(review): relies on seaborn (`sns`) imported in an earlier section's
# cell; this section's import cell (4.1) does not import it.
for column in categorical_columns:
    plt.figure(figsize=(7, 5))
    # One palette color per distinct category level.
    colors = sns.color_palette('Set3', len(df[column].unique()))
    df[column].value_counts().plot.pie(
        autopct='%1.1f%%', 
        colors=colors, 
        startangle=90, 
        wedgeprops={'alpha':0.8},
        textprops={'color':"black"}
    )
    plt.title(f'Distribution of {column}', fontsize=16)
    plt.ylabel('')  # hide pandas' default y-axis label on pie charts
    plt.show()

4.4 | Scatter Plot for Relationships Between Two Numerical Variables

In [97]:
# 3. Scatter Plot for Relationships Between Two Numerical Variables
plt.figure(figsize=(7, 5))
sns.scatterplot(x='Present_Price', y='Selling_Price', hue='Fuel_Type', data=df, palette='viridis')
plt.title('Scatter Plot of Present Price vs. Selling Price')
plt.xlabel('Present Price')
plt.ylabel('Selling Price')
plt.show()

4.5 | Heatmap for Correlation Matrix

In [96]:
# 4. Heatmap for Correlation Matrix
plt.figure(figsize=(12, 8))
corr_matrix = df.select_dtypes(include=['float64', 'int64']).corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix')
plt.show()

4.6 | Bar Plot for Categorical vs. Numerical

In [100]:
# 5. Bar Plot for Categorical vs. Numerical
plt.figure(figsize=(7, 5))
sns.barplot(x='Fuel_Type', y='Selling_Price', data=df, palette='viridis')
plt.title('Bar Plot of Selling Price by Fuel Type')
plt.xlabel('Fuel Type')
plt.ylabel('Selling Price')
plt.show()

4.7 | Line Plot for Trends

In [102]:
# 6. Line Plot for Trends
plt.figure(figsize=(7, 5))
sns.lineplot(x='Year', y='Selling_Price', data=df, marker='o')
plt.title('Line Plot of Selling Price over Years')
plt.xlabel('Year')
plt.ylabel('Selling Price')
plt.show()

4.8 | Joint Plot

In [104]:
# 7. Joint Plot for Detailed Relationships
sns.jointplot(x='Present_Price', y='Selling_Price', data=df, hue='Fuel_Type', palette='viridis')
plt.suptitle('Joint Plot of Present Price vs. Selling Price', y=1.02)
plt.show()

4.9 | 3D Scatter Plot

In [108]:
# 8. 3D Scatter Plot
# Axes: Present_Price vs Kms_Driven vs Selling_Price; color encodes Year,
# so four variables are shown at once.
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')

# Plotting the scatter plot 
sc = ax.scatter(df['Present_Price'], df['Kms_Driven'], df['Selling_Price'], 
                c=df['Year'], cmap='viridis', s=50, alpha=0.8, edgecolors='w', linewidth=0.5)

# Adding labels and title
ax.set_xlabel('Present Price', fontsize=12)
ax.set_ylabel('Kms Driven', fontsize=12)
ax.set_zlabel('Selling Price', fontsize=12)
ax.set_title('3D Scatter Plot of Present Price, Kms Driven, and Selling Price', fontsize=14)

# Adding a color bar
cbar = plt.colorbar(sc, ax=ax, pad=0.1)
cbar.set_label('Year', rotation=270, labelpad=15)

# Adding grid for better readability
ax.grid(True)

# Setting a specific view angle
ax.view_init(elev=20., azim=30)

# Show plot
plt.show()

4.10 | Scatter Regression Plot

In [17]:
# 9. Scatter Regression Plot
plt.figure(figsize=(7, 5))
sns.regplot(x='Present_Price', y='Selling_Price', data=df, scatter_kws={'color': 'blue'}, line_kws={'color': 'yellow'})
plt.title('Regression Plot of Present Price vs Selling Price')
plt.show()

4.11 | Bubble Plot

In [19]:
# 10. Bubble Plot
plt.figure(figsize=(7, 5))
sns.scatterplot(x='Present_Price', y='Selling_Price', size='Kms_Driven', hue='Year', data=df, alpha=0.9, sizes=(20, 200), palette='viridis')
plt.title('Bubble Plot of Present Price vs Selling Price with Kms Driven')
plt.show()

4.12 | Pair Plot

In [31]:
# 11. Multi-Feature Pair Plot
sns.pairplot(df, hue='Fuel_Type', palette='coolwarm', diag_kind='kde', height=3, aspect=1.3, plot_kws={'alpha': 0.9, 's': 50, 'edgecolor': 'w', 'linewidth': 0.5})
plt.show()

4.13 | Facet Grid Plot

In [28]:
# 12. Facet Grid Plot
# One scatter panel per Fuel_Type (wrapped two per row), points colored by
# Transmission within each panel.
g = sns.FacetGrid(df, col='Fuel_Type', hue='Transmission', height=5, aspect=1.2, palette='Set1', col_wrap=2)
g.map(sns.scatterplot, 'Present_Price', 'Selling_Price', s=100, alpha=0.3, edgecolor='w', linewidth=0.5)
g.add_legend()
g.set_axis_labels('Present Price', 'Selling Price')
g.fig.suptitle('Facet Grid Plot of Present Price vs. Selling Price by Fuel Type and Transmission', y=1.05)

# Additional customizations
# Remove the top/right spines of every panel for a cleaner look.
for ax in g.axes.flat:
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

plt.show()

Question No. 5

Apply PCA (Principal Component Analysis) to reduce the dimensionality of a dataset and visualize the results in 2D/3D.¶

5.1 | Import Libraries & Data

In [57]:
# Dependencies for Question 5: sklearn for scaling + PCA, mpl_toolkits for 3D axes.
from sklearn.decomposition import PCA
import pandas as pd
from sklearn.preprocessing import StandardScaler
from mpl_toolkits.mplot3d import Axes3D  # registers the '3d' projection (needed on older matplotlib)
import numpy as np
import matplotlib.pyplot as plt



# Load the dataset (breast-cancer diagnostic data, per the columns shown below).
# NOTE(review): hard-coded absolute Windows path — must be edited to run elsewhere.
df = pd.read_csv("C:\\Users\\acer\\Downloads\\data.csv")

5.2 | Basic Data Analysis

In [58]:
# Preview the first five rows of the raw dataset.
df.head()
Out[58]:
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean ... texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst Unnamed: 32
0 842302 M 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710 ... 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890 NaN
1 842517 M 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017 ... 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902 NaN
2 84300903 M 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790 ... 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758 NaN
3 84348301 M 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520 ... 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300 NaN
4 84358402 M 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430 ... 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678 NaN

5 rows × 33 columns

In [61]:
# Column dtypes and non-null counts; note 'Unnamed: 32' has 0 non-null values.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             569 non-null    float64
 15  area_se                  569 non-null    float64
 16  smoothness_se            569 non-null    float64
 17  compactness_se           569 non-null    float64
 18  concavity_se             569 non-null    float64
 19  concave points_se        569 non-null    float64
 20  symmetry_se              569 non-null    float64
 21  fractal_dimension_se     569 non-null    float64
 22  radius_worst             569 non-null    float64
 23  texture_worst            569 non-null    float64
 24  perimeter_worst          569 non-null    float64
 25  area_worst               569 non-null    float64
 26  smoothness_worst         569 non-null    float64
 27  compactness_worst        569 non-null    float64
 28  concavity_worst          569 non-null    float64
 29  concave points_worst     569 non-null    float64
 30  symmetry_worst           569 non-null    float64
 31  fractal_dimension_worst  569 non-null    float64
 32  Unnamed: 32              0 non-null      float64
dtypes: float64(31), int64(1), object(1)
memory usage: 146.8+ KB
In [62]:
# Shape before cleanup: (569, 33).
df.shape
Out[62]:
(569, 33)
In [59]:
# Remove the all-NaN artifact column and the non-numeric target label in a
# single in-place drop, leaving only numeric columns for scaling and PCA.
# NOTE(review): 'id' is a record identifier, not a measurement, yet it remains
# in the feature set fed to PCA — consider dropping it here as well.
df.drop(columns=["Unnamed: 32", "diagnosis"], inplace=True)
In [64]:
# Shape after dropping the two columns: (569, 31).
df.shape
Out[64]:
(569, 31)

5.3 | Standardize Features For PCA

In [60]:
# Standardize every feature to zero mean / unit variance so PCA is not
# dominated by large-scale columns (e.g. 'area_mean' is in the thousands).
scaler = StandardScaler()
# Despite the name, fit_transform returns a NumPy array, not a DataFrame.
scaled_df = scaler.fit_transform(df)
In [64]:
# Wrap the scaled array back into a DataFrame with the original column names,
# purely for inspection; the PCA below consumes the raw array `scaled_df`.
df_pca = pd.DataFrame(scaled_df, columns=df.columns)
df_pca.head()
Out[64]:
id radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
0 -0.236405 1.097064 -2.073335 1.269934 0.984375 1.568466 3.283515 2.652874 2.532475 2.217515 ... 1.886690 -1.359293 2.303601 2.001237 1.307686 2.616665 2.109526 2.296076 2.750622 1.937015
1 -0.236403 1.829821 -0.353632 1.685955 1.908708 -0.826962 -0.487072 -0.023846 0.548144 0.001392 ... 1.805927 -0.369203 1.535126 1.890489 -0.375612 -0.430444 -0.146749 1.087084 -0.243890 0.281190
2 0.431741 1.579888 0.456187 1.566503 1.558884 0.942210 1.052926 1.363478 2.037231 0.939685 ... 1.511870 -0.023974 1.347475 1.456285 0.527407 1.082932 0.854974 1.955000 1.152255 0.201391
3 0.432121 -0.768909 0.253732 -0.592687 -0.764464 3.283553 3.402909 1.915897 1.451707 2.867383 ... -0.281464 0.133984 -0.249939 -0.550021 3.394275 3.893397 1.989588 2.175786 6.046041 4.935010
4 0.432201 1.750297 -1.151816 1.776573 1.826229 0.280372 0.539340 1.371011 1.428493 -0.009560 ... 1.298575 -1.466770 1.338539 1.220724 0.220556 -0.313395 0.613179 0.729259 -0.868353 -0.397100

5 rows × 31 columns

In [65]:
# The standardized NumPy array that is fed to PCA below.
scaled_df
Out[65]:
array([[-0.23640517,  1.09706398, -2.07333501, ...,  2.29607613,
         2.75062224,  1.93701461],
       [-0.23640344,  1.82982061, -0.35363241, ...,  1.0870843 ,
        -0.24388967,  0.28118999],
       [ 0.43174109,  1.57988811,  0.45618695, ...,  1.95500035,
         1.152255  ,  0.20139121],
       ...,
       [-0.23572747,  0.70228425,  2.0455738 , ...,  0.41406869,
        -1.10454895, -0.31840916],
       [-0.23572517,  1.83834103,  2.33645719, ...,  2.28998549,
         1.91908301,  2.21963528],
       [-0.24240586, -1.80840125,  1.22179204, ..., -1.74506282,
        -0.04813821, -0.75120669]])

5.4 | Principal Component Analysis

PCA is:

  • Linear dimensionality reduction using Singular Value Decomposition of the data to project it to a lower dimensional space.
  • Unsupervised Machine Learning.
  • A transformation of your data that attempts to find the directions (components) that explain the most variance in your data, as the plots below demonstrate.
In [66]:
# Perform PCA
# No n_components argument, so all components are kept (one per feature;
# 31 here). Components come out ordered by decreasing explained variance.
pca = PCA()
pca_components = pca.fit_transform(scaled_df)

# Creating a DataFrame for the principal components
pca_df = pd.DataFrame(data=pca_components, columns=[f"PC{i+1}" for i in range(pca_components.shape[1])])
pca_df.head()
Out[66]:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 ... PC22 PC23 PC24 PC25 PC26 PC27 PC28 PC29 PC30 PC31
0 9.183200 1.971271 -1.171625 -3.639332 1.193098 1.372280 0.371844 2.180529 -0.231610 -0.090396 ... -0.107371 -0.069634 -0.085232 -0.175628 -0.150774 0.200807 -0.253193 0.033911 -0.045572 0.047166
1 2.383298 -3.753459 -0.580229 -1.127438 -0.624850 0.126616 -0.288270 0.044935 0.426916 -0.659939 ... 0.075191 0.091740 0.213925 0.010368 -0.170210 0.042420 0.180649 -0.032630 0.005902 0.001845
2 5.742472 -1.080350 -0.533088 -0.903470 0.180601 0.401855 0.463188 -0.715209 -0.010712 -0.082305 ... -0.303285 0.058930 0.074145 0.103834 0.170749 -0.005066 0.049887 -0.047022 -0.003290 -0.000735
3 7.124384 10.272225 -3.150161 -0.121512 2.968055 2.561668 1.950177 1.287990 1.272638 -1.171376 ... -0.410865 0.205120 0.135322 0.158659 0.075684 0.273048 0.184350 -0.042465 0.068640 0.020001
4 3.945694 -1.959689 1.401177 -2.937555 -0.540853 -1.233300 -0.205435 -0.959329 0.629119 -0.166354 ... 0.117341 0.020423 -0.135431 -0.004874 0.002884 -0.039637 0.032586 0.034790 -0.005183 -0.021199

5 rows × 31 columns

In [67]:
# Print the number of components and the explained variance ratio
# Since every component was kept, the ratios sum to 1.0.
num_components = pca.n_components_
explained_variance_ratio = pca.explained_variance_ratio_
In [81]:
# Tabulate the fraction of total variance each principal component explains.
component_labels = [f'PC{i+1}' for i in range(num_components)]
explained_variance_df = pd.DataFrame({
    'Principal Component': component_labels,
    'Explained Variance Ratio': explained_variance_ratio,
})

print("Number of Principal Components:", num_components)
print("\nExplained Variance Ratio of each Principal Component:")
explained_variance_df.head(7)
Number of Principal Components: 31

Explained Variance Ratio of each Principal Component:
Out[81]:
Principal Component Explained Variance Ratio
0 PC1 0.428647
1 PC2 0.183768
2 PC3 0.091464
3 PC4 0.063915
4 PC5 0.053188
5 PC6 0.039828
6 PC7 0.031557
In [143]:
# Cumulative Explained Variance Ratio
# Running total of the per-component ratios — shows how many PCs are needed
# for a given coverage (the first 7 reach ~89% of total variance).
cumulative_explained_variance = np.cumsum(explained_variance_ratio)
pc_names = [f'PC{i+1}' for i in range(num_components)]
cumulative_variance_df = pd.DataFrame({
    'Principal Component': pc_names,
    'Explained Variance Ratio': explained_variance_ratio,
    'Cumulative Explained Variance': cumulative_explained_variance,
})

print("\nCumulative Explained Variance Ratio:")
cumulative_variance_df.head(7)
Cumulative Explained Variance Ratio:
Out[143]:
Principal Component Explained Variance Ratio Cumulative Explained Variance
0 PC1 0.428647 0.428647
1 PC2 0.183768 0.612415
2 PC3 0.091464 0.703879
3 PC4 0.063915 0.767794
4 PC5 0.053188 0.820982
5 PC6 0.039828 0.860810
6 PC7 0.031557 0.892367

5.5 | Plotting PCA By Different Methods

In [142]:
# Principal Component Variance Explained Plot
# Bar chart: one bar per component, height = explained variance ratio.
component_index = range(1, num_components + 1)
plt.figure(figsize=(11, 4))
plt.bar(component_index, explained_variance_ratio, color='skyblue')
plt.title('Variance Explained by Each Principal Component')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.xticks(component_index)
plt.grid(True, linestyle='--', alpha=0.7)
plt.show()
In [114]:
# Scree Plot
# Same per-component ratios as the bar chart, in line+marker form — the
# "elbow" suggests where additional components stop paying off.
xs = range(1, num_components + 1)
plt.figure(figsize=(11, 4))
plt.plot(xs, explained_variance_ratio, marker='o', linestyle='--', color='yellow')
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.grid(True)
plt.xticks(xs)
plt.show()
In [113]:
# Cumulative Explained Variance Plot
# Running total of variance coverage as components are added.
xs = range(1, num_components + 1)
plt.figure(figsize=(11, 4))
plt.plot(xs, cumulative_explained_variance, marker='o', linestyle='-', color='r')
plt.title('Cumulative Explained Variance')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid(True)
plt.xticks(xs)
plt.show()
In [112]:
# Combined Scree Plot
# Per-component and cumulative explained variance overlaid on one axis.
xs = range(1, num_components + 1)
plt.figure(figsize=(11, 4))
plt.plot(xs, explained_variance_ratio, marker='o', linestyle='--', color='yellow', label='Explained Variance')
plt.plot(xs, cumulative_explained_variance, marker='o', linestyle='-', color='r', label='Cumulative Explained Variance')
plt.title('Explained and Cumulative Variance')
plt.xlabel('Principal Component')
plt.ylabel('Variance')
plt.legend()
plt.grid(True)
plt.show()
In [131]:
 #Combine PC1 and PC2 for the color gradient
pca_df['Color'] = np.sqrt(pca_df['PC1']**2 + pca_df['PC2']**2)  # Magnitude of the vector

# Define a colormap for the gradient
cmap = plt.get_cmap('viridis')

# Create the scatter plot with gradient colors
plt.figure(figsize=(11, 6))
scatter = plt.scatter(
    pca_df['PC1'], pca_df['PC2'],
    c=pca_df['Color'], cmap=cmap,
    alpha=0.8, edgecolors='k', s=100
)
plt.colorbar(scatter, label='Combined PC1 and PC2 Value')
plt.xlabel('First Principal Component (PC1)')
plt.ylabel('Second Principal Component (PC2)')
plt.title('Scatter Plot of PC1 vs. PC2 with Combined Color Gradient')
plt.grid(True)
plt.show()
In [139]:
# Calculate the magnitude for color gradient: 3D distance from the origin
# in (PC1, PC2, PC3) space.
pca_df['Magnitude'] = np.sqrt(pca_df['PC1'] ** 2 + pca_df['PC2'] ** 2 + pca_df['PC3'] ** 2)

# Define a colormap
cmap = plt.get_cmap('coolwarm')

# Create 3D scatter plot of the first three principal components.
fig = plt.figure(figsize=(12, 7))
ax = fig.add_subplot(111, projection='3d')
point_colors = pca_df['Magnitude']
scatter = ax.scatter(
    pca_df['PC1'],
    pca_df['PC2'],
    pca_df['PC3'],
    c=point_colors,
    cmap=cmap,
    edgecolor='k',
    s=50,
)
fig.colorbar(scatter, ax=ax, label='Magnitude of PC1, PC2, and PC3')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')
ax.set_title('3D Scatter Plot of First Three Principal Components with Magnitude Color Gradient')
plt.show()
In [147]:
# Compare the feature loadings of the strongest and weakest components.
explained_variance = pca.explained_variance_ratio_

# Get the indices of the best (most significant) and worst (least significant)
# components based on explained variance. sklearn orders components by
# decreasing variance, so these are in practice the first and last components.
best_index = np.argmax(explained_variance)   # highest explained variance
worst_index = np.argmin(explained_variance)  # lowest explained variance

# Plot Loading Scores (per-feature weights) of Best and Worst Components.
# Format the ratios with ':.3g' instead of ':.2f' — the worst component's
# near-zero ratio previously rendered as "0.00", making its label useless.
plt.figure(figsize=(9, 5))
plt.plot(
    pca.components_[best_index, :],
    label=f'Best Component (PC{best_index+1}) - Variance: {explained_variance[best_index]:.3g}',
)
plt.plot(
    pca.components_[worst_index, :],
    label=f'Worst Component (PC{worst_index+1}) - Variance: {explained_variance[worst_index]:.3g}',
)
plt.title('Loading Scores of Best and Worst Principal Components')
plt.xlabel('Feature Index')
plt.ylabel('Loading Score')
plt.legend(loc='best')
plt.grid(True)
plt.show()
In [ ]:
 
In [ ]:
 
In [ ]: